A short description of the post. The following packages will be used in this assignment.
# Packages required for the assignment; install any that are missing,
# then attach them all.
packages <- c("stringr", "stringi", "tidyr", "tidyverse", "dplyr", "tm",
              "lubridate", "corporaexplorer", "quanteda",
              "quanteda.textstats", "igraph", "visNetwork", "tidygraph",
              "ggraph", "networkD3", "stm", "tidytext", "widyr",
              "wordcloud", "ggwordcloud", "textplot")
for (p in packages) {
  # require() returns FALSE (instead of erroring) when the package is
  # absent — exactly the install-on-demand check needed here.
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  library(p, character.only = TRUE)
}
Overview: In this task, all news articles will be loaded and formatted into a data frame. After cleaning and feature extraction, cosine similarity is computed for each pair of articles. Only the top 0.5% of pairs with the highest similarity are analyzed here. The threshold cosine similarity is 0.54, based on the assumption that pairs of articles with cosine similarity above this value are strongly similar. Within each pair, the article published earlier is considered the primary source and the one published later the secondary source. This value forms a new column, “role”, in the data frame. Each article among the top 0.5% of pairs is labeled with the IDs of the articles similar to it, both published earlier and later, in two new columns named “similar_earlier” and “similar_later”. This enables easier visualization in the interactive corpus explorer. An interactive directed network graph is used to visualize connections between sources. After adding the new columns, the corpus data frame is visualized with the corpus explorer’s search function. Steps and demonstrations are described below.
1. Get the list of articles in all paper sources
# Recursively collect the full path of every article file under the
# News Articles folder.
list_of_paper <- list.files(
  path = "MC1/News Articles",
  pattern = ".",
  recursive = TRUE,
  full.names = TRUE
)
2. Load in each article in an iterative way and save them into a list of data frames
# Read every article file into a tidy one-row-per-field data frame.
# Each article file holds "FIELD: value" lines; split each line on the
# first ":" into (type, entry) and pivot so fields become columns.
# lapply replaces the original grow-a-list-with-a-counter loop.
df_list <- lapply(list_of_paper, function(f) {
  df_temp <- data.frame(
    # article id = the numeric part of the file name after the last "/"
    news_no = str_extract(f, "(?<=\\/)\\d+"),
    rawdata = readLines(f),
    stringsAsFactors = FALSE
  )
  # str_split_fixed returns an n x 2 matrix; str_trim flattens it
  # column-major, so the first n values fill "type", the next n "entry".
  df_temp[, c("type", "entry")] <-
    str_trim(str_split_fixed(df_temp$rawdata, ":", 2))
  df_temp <- df_temp[, c("news_no", "type", "entry")]
  pivot_wider(df_temp, names_from = type, values_from = entry)
})
3. Bind all data frames in the list
df <- do.call(rbind.fill,df_list)
1. check data types
# Publication-date layouts observed across the different news sources.
formats <- c("%d %B %Y", "%d%B %Y", "%B %d, %Y", "%Y/%m/%d")
# Parse PUBLISHED against all candidate formats, normalise to a GMT Date,
# and make the article id numeric.
df <- df %>%
  mutate(
    date = as.Date(as.POSIXct(parse_date_time(PUBLISHED, formats),
                              tz = "GMT")),
    news_no = as.integer(news_no)
  )
2. change column name and adjust column order in order to load to corpus
3. This data frame will be used as the main corpus to visualize in corpus explorer. No stopword and punctuation are removed in order to not affect reading. Save the data.
write_rds(df_corpus, "rds/original_corpus")
1. Use package “quanteda” to format corpus into a dfm and clean the corpus by removing stop words and punctuation. Calculate cosine similarity between each pair of document and convert to a data frame.
# Build a quanteda corpus keyed by document id, clean it, and compute
# pairwise cosine similarity between documents.
news_corpus <- corpus(df_corpus, docid_field = "Doc_id", text_field = "Text")
# quanteda >= 3 deprecates passing remove/remove_punct straight to dfm();
# tokenize first, then strip punctuation and English stop words.
clean_corpus <- news_corpus %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  dfm()
sim <- textstat_simil(clean_corpus, margin = "document", method = "cosine")
# Long format: one row per (document1, document2, cosine) pair.
sim_df <- as.data.frame(sim)
# Factor ids -> integers so they join cleanly against Doc_id later.
sim_df$document1 <- as.integer(as.character(sim_df$document1))
sim_df$document2 <- as.integer(as.character(sim_df$document2))
2. create a table of source for merging later
# Lookup table (document id, source, publication date) for the joins below.
df_source <- select(df_corpus, Doc_id, Source, date)
# Keep only the most similar 0.5% of pairs (top bucket of 200 ntiles),
# attach source/date for both documents, and orient each pair so that
# document1/Source1 is always the earlier-published article.
sim_tmp <- sim_df %>%
rowid_to_column(var="pair_id") %>%
# rank 200 of 200 = the top 0.5% by cosine similarity
mutate(rank=ntile(cosine,200)) %>%
filter(rank==200) %>%
# The first join introduces Source/date with no name clash, so the
# suffixes only take effect on the second join: Source1/Source2, date1/date2.
left_join(df_source, by=c("document1"="Doc_id"), suffix=c("1","2")) %>%
left_join(df_source, by=c("document2"="Doc_id"), suffix=c("1","2")) %>%
# Drop same-source and same-day pairs: primary vs secondary is undefined there.
filter(Source1 != Source2) %>%
filter(date1 != date2) %>%
# transform() evaluates all its arguments against the incoming frame, so
# the two assignments below swap document1/document2 without clobbering
# each other; the second transform swaps the sources using the dates,
# which were deliberately left unswapped.
transform(document1=ifelse(date1<=date2, document1, document2),
document2=ifelse(date1<=date2, document2, document1)) %>%
transform(Source1=ifelse(date1<=date2, Source1, Source2),
Source2=ifelse(date1<=date2, Source2, Source1))
# Per-document counts of how often it appears as the earlier (primary)
# member of a top pair; count() is the idiomatic group_by + summarise(n()).
df_primary <- sim_tmp %>%
  dplyr::count(document1, name = "n") %>%
  mutate(role = "primary")
# Same for the later (secondary) member of each pair.
df_secondary <- sim_tmp %>%
  dplyr::count(document2, name = "n") %>%
  mutate(role = "secondary")
#join the new column to df_corpus
# Attach primary/secondary counts to the corpus. The _pri/_sec suffixes
# only take effect on the second join, when the n and role columns collide.
# unite() with na.rm=TRUE then collapses the four columns into one "role"
# string such as "1_primary_1_secondary", dropping whichever side is NA.
df_join <- df_corpus %>%
left_join(df_primary, by=c("Doc_id"="document1"), suffix=c("_pri","_sec")) %>%
left_join(df_secondary, by=c("Doc_id"="document2"), suffix=c("_pri","_sec")) %>%
unite("role", n_pri, role_pri, n_sec, role_sec, sep="_",na.rm=TRUE)
glimpse(df_join)
Rows: 845
Columns: 9
$ Doc_id <int> 121, 135, 152, 154, 237, 251, 341, 391, 420, 554, …
$ Text <chr> "Fifteen members of the Protectors of Kronos (POK)…
$ Source <chr> "All News Today", "All News Today", "All News Toda…
$ Title <chr> "POK PROTESTS END IN ARRESTS", "RALLY SCHEDULED IN…
$ Author <chr> NA, NA, NA, "Petrus Gerhard", NA, NA, NA, NA, NA, …
$ date <date> 2005-04-06, 2012-04-09, 1993-02-02, 1998-03-20, 1…
$ Location <chr> "ELODIS, Kronos", "ABILA, Kronos", "ABILA, Kronos"…
$ Note <chr> NA, NA, NA, "This article is the first in a series…
$ role <chr> "1_primary_1_secondary", "4_primary", "3_primary",…
# For every primary document, list the ids of its later-published matches,
# most similar first, joined with "_".
pair_pri <- sim_tmp %>%
  select(cosine, pair_id, document1, document2) %>%
  arrange(document1, desc(cosine)) %>%
  group_by(document1) %>%
  dplyr::summarize(similar_later = paste(document2, collapse = "_"))
# For every secondary document, list the ids of its earlier-published
# matches, most similar first, joined with "_".
pair_sec <- sim_tmp %>%
  select(cosine, pair_id, document1, document2) %>%
  arrange(document2, desc(cosine)) %>%
  group_by(document2) %>%
  dplyr::summarize(similar_earlier = paste(document1, collapse = "_"))
# Attach the similar-document id strings to the corpus table: first the
# later-published matches, then the earlier-published ones.
df_join <- left_join(df_join, pair_pri, by = c("Doc_id" = "document1"))
df_join <- left_join(df_join, pair_sec, by = c("Doc_id" = "document2"))
glimpse(df_join)
Rows: 845
Columns: 11
$ Doc_id <int> 121, 135, 152, 154, 237, 251, 341, 391, 420…
$ Text <chr> "Fifteen members of the Protectors of Krono…
$ Source <chr> "All News Today", "All News Today", "All Ne…
$ Title <chr> "POK PROTESTS END IN ARRESTS", "RALLY SCHED…
$ Author <chr> NA, NA, NA, "Petrus Gerhard", NA, NA, NA, N…
$ date <date> 2005-04-06, 2012-04-09, 1993-02-02, 1998-0…
$ Location <chr> "ELODIS, Kronos", "ABILA, Kronos", "ABILA, …
$ Note <chr> NA, NA, NA, "This article is the first in a…
$ role <chr> "1_primary_1_secondary", "4_primary", "3_pr…
$ similar_later <chr> "760", "414_239_207_773", "639_210_748", "3…
$ similar_earlier <chr> "221", NA, NA, "69", NA, "688_297", "290", …
The shiny app is first deployed to shinyapps.io and then embedded in the R Markdown. For a full view, please click here to view the app in a browser.
Next, create nodes list and edges list for visualizing dependencies network between sources. This network graph will be used together with interactive corpus explorer and interactive table to gain insights.
# One node per news source: value = document count, title = HTML tooltip.
sim_nodes <- df_source %>%
  count(Source, name = "value") %>%
  rename(label = Source) %>%
  rowid_to_column("id") %>%
  mutate(title = paste("<p>", label, "<br>", value,
                       "documents in total</p>", sep = " "))
# One edge per (primary, secondary) source pair; value = number of
# similar-article pairs between the two sources.
sim_edges <- sim_tmp %>%
  group_by(Source1, Source2) %>%
  # .groups = "drop" returns an ungrouped frame; the original left the
  # result grouped by Source1, which was never needed downstream.
  summarise(value = n(), .groups = "drop") %>%
  # Map source names to node ids; sim_nodes also carries a `value`
  # column, hence the value.x/value.y suffixes handled below.
  left_join(sim_nodes, by = c("Source1" = "label")) %>%
  rename(from = id) %>%
  left_join(sim_nodes, by = c("Source2" = "label")) %>%
  rename(to = id) %>%
  select(from, to, value.x, Source1, Source2) %>%
  rename(value = value.x) %>%
  mutate(title = paste("<p>Similar Articles<br>primary:", Source1, "</br>",
                       "<br>secondary:", Source2, "</br>",
                       value, "similar documents</p>", sep = " ")) %>%
  select(Source1, Source2, from, to, value, title)
# Interactive circular network of source dependencies; arrowheads sit at
# the middle of each edge.
visNetwork(sim_nodes, sim_edges) %>%
  # Stray trailing comma removed from the original visEdges() call.
  visEdges(arrows = 'middle') %>%
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%
  visIgraphLayout(layout = "layout_in_circle")
The table shows the number of articles similar
# Filterable table of similar-document counts per (primary, secondary)
# source pair.
DT::datatable(
  sim_edges %>% select(Source1, Source2, value),
  colnames = c("Primary", "Secondary", "Similar_doc_count"),
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    htmltools::em("Interactive table for aggregated view of sources
and similar documents")
  ),
  filter = "top",
  rownames = FALSE,
  options = list(pageLength = 7, autoWidth = FALSE)
)
Overview: Corpus explorer from Task 1 will be used again to view each text. By searching for key terms, “POK”, “APA”, “government”, and “GASTech”, we aim to find out if there is any bias held against these terms in each day source. Additionally, word cloud is plotted for each source to find frequently appeared words.
From the background information, we understand that POK(protectors of kronos) was established to advocate environmental protection in Kronos. A key incident in the growth of POK was death of Juliana Vann, a 10-year-old girl who died from leukemia, directly or indirectly resulted from water pollution. Water pollution was contributed mostly by GASTech factory in Kronos.
In general, POK is against GASTech and Kronos government did little to help with the water pollution. Because POK is clearly on the opposite side of government and GASTech, in exploring the corpus, we filter out 2 kinds of news:
+ those mention both POK and government
+ those mention both POK and GASTech
By analysing these texts, we aim to find opinions held by news sources.
The shiny app is from previous section. Search for POK, government and gastech to highlight in corpus.
Red indicates POK, blue indicates government and green indicates GASTech. Each bar plots the frequency of word occurrence in the source.
From the plot, there is clearly barely any mention of POK in Centrum Sentinel, Modern Rubicon, Tethys News, The General Post, The Light of Truth, The Tulip, The World, and Who What News. These sources tend to report more on GASTech, with some news on the government.
From corpus explorer, we can understand that Centrum Sentinel, Modern Rubicon, Tethys News report update on the kidnap incident with timestamp. They only focus on reporting the incident.
Referring to the picture below, each source has 0 or no mentioning of POK. The background information has revealed that government was planning on tax relief for gas and oil to encourage companies such as GASTech. Since we assume government and gastech are on the opposite side of POK, these sources can be categorised as neutral-favors gastech/government.
In most of the sources, GASTech is the most reported topic. This is especially so in Homeland Illumination, Kronos Star, News Online Today, and The Abila Post. As group 1 contains many sources that report mainly on GASTech and the government, group 2 excludes those in group 1 even though there were originally some overlaps. This group reports on all parties but with a focus on GASTech. They are in general more neutral than group 1.
To understand if there was any bias in reporting, additional search fields are set: rally and protest. Only corpus containing “rally” is shown.
The above mentioned 4 sources are all among the list. Even though GASTech was mentioned a lot in this category, the news never fails to discuss about key incidents about POK.
Especially Homeland Illumination which uses long articles to discuss about POK. The data table below enables searching for specific sentence containing the keywords.
# Explode each article into one row per sentence (split on periods) and
# strip the surrounding whitespace from each sentence.
df_sentence <- df_join %>%
  separate_rows(Text, sep = "[.]") %>%
  transmute(Source, Text = str_trim(Text))
The breakdown sentences shows that Homeland Illumination is in a supportive role of POK. Some sentences such as, “Elian Karel, age 28, died Friday while unlawfully incarcerated at the Abila City Prison” in one of the articles showed sympathy for the POK leader’s death in prison.
While the rest of the sources in this group do not have comments in the reports.
Homeland Illumination - In favor of POK Kronos Star, News Online Today, The Abila Post - Neutral
Some sources have roughly equal coverage in three categories. These sources include All News Today, Athena Speaks, Central bulletin, Daily Peagasus, Everyday News, International Times, News Deck, The Continent, The Explainer, The Orb, The Wrap, World of Journal.
These sources report more on POK as compared to the coverage of government and GASTech. Such sources are International News, The guide, The Truth, Worldwise.
Word cloud graphs are plotted according the previous grouped categories.
Disinterest in POK
# Custom stop list: tidytext's English stop words plus corpus-specific terms.
stop <- c(stop_words$word, "kronos", "elodis")
# Per-source word frequencies for the word clouds below.
word_freq <- df_join %>%
  select(Source, Text) %>%
  unnest_tokens(Word, Text) %>%
  # Keep tokens ending in a letter or apostrophe (drops bare numbers)
  # that are not in the stop list.
  filter(str_detect(Word, "[a-z']$"),
         !Word %in% stop) %>%
  # count() already returns an ungrouped frame, so the original trailing
  # ungroup() was a no-op and has been removed.
  count(Source, Word, sort = TRUE)
set.seed(2021)           # wordcloud() placement uses the RNG; fix it
par(mar = rep(0, 4),     # no margins so clouds fill each panel
    mfrow = c(2, 3))     # 2 x 3 grid, one panel per source
# Draw the word cloud for a single news source.
# NOTE: the original calls also passed min_font_size/max_font_size, which
# are not wordcloud() arguments and were silently swallowed by `...`;
# they are dropped here with no change to the output.
draw_source_cloud <- function(src) {
  word_freq %>%
    filter(Source == src) %>%
    with(wordcloud(Word, n, min.freq = 2,
                   max.words = 100, random.order = FALSE,
                   scale = c(2, 0.3), rot.per = 0.35,
                   colors = brewer.pal(8, "Dark2")))
}
# One cloud per "disinterested in POK" source, replacing five copy-pasted
# pipelines with a single helper call each.
for (src in c("The General Post", "The Light of Truth", "The Tulip",
              "The World", "Who What News")) {
  draw_source_cloud(src)
}
It is clear that these 5 sources — The General Post, The Light of Truth, The Tulip, The World, and Who What News — report mostly on GASTech and on Sten Sanjorge, the CEO of GASTech.
Passion in GASTech
set.seed(2021)           # fix RNG so cloud layout is reproducible
par(mar = rep(0, 4),     # no margins around each cloud
    mfrow = c(2, 2))     # 2 x 2 grid, one panel per source
stop2 <- c("gastech","government") #define additional word to be removed
# Draw the word cloud for one source, excluding the stop2 terms.
# NOTE: the originals also passed min_font_size/max_font_size, which are
# not wordcloud() arguments and were silently swallowed by `...`.
draw_cloud_wo_stop2 <- function(src) {
  word_freq %>%
    filter(Source == src,
           !Word %in% stop2) %>%
    with(wordcloud(Word, n, min.freq = 2,
                   max.words = 100, random.order = FALSE,
                   scale = c(2, 0.3), rot.per = 0.35,
                   colors = brewer.pal(8, "Dark2")))
}
# One cloud per GASTech-focused source; replaces four duplicated pipelines.
for (src in c("Homeland Illumination", "Kronos Star",
              "News Online Today", "The Abila Post")) {
  draw_cloud_wo_stop2(src)
}
There are a lot of mentioning of POK leader, Elian Karel, in these sources. In addition, contamination, water, health, drug, protectors are frequently mentioned in these sources. These usually appear in the slogans in POK rally and protests.
install chorddiag and load library. Chord diagram will be used to analyze email conversation between employees.
# chorddiag is not on CRAN; install it from mattflor's GitHub repository
# and attach it. Used below for the email chord diagrams.
devtools::install_github("mattflor/chorddiag")
library(chorddiag)
Read in employee records to form nodes. Format the columns of dates to date format. A new column “Name” is created to replace FirstName and LastName. Column “Age” is created from BirthDate. Age is calculated until 2014 when the kidnap incident happened.
# Employee master table: build a dotted full name (matching the local part
# of the email addresses), parse all date columns, and derive Age as of
# 2014, the year the kidnap incident happened.
employee_records <- read_csv("MC1/EmployeeRecords.csv")
employee_records <- employee_records %>%
mutate(Name=paste(FirstName, LastName, sep=".")) %>%
# sub() replaces only the first space, i.e. inside multi-word first names.
transform(Name=sub(" ", ".", Name)) %>%
transform(BirthDate=parse_date_time(BirthDate,"%d/%m/%y"),
CitizenshipStartDate=parse_date_time(CitizenshipStartDate, "%d/%m/%y"),
PassportIssueDate=parse_date_time(PassportIssueDate, "%d/%m/%y"),
PassportExpirationDate=parse_date_time(PassportExpirationDate,
"%d/%m/%y"),
CurrentEmploymentStartDate=parse_date_time(CurrentEmploymentStartDate,
"%d/%m/%y"),
# Only the discharge YEAR is kept, not the full date.
MilitaryDischargeDate=year(parse_date_time(MilitaryDischargeDate,
"%d/%m/%y"))) %>%
# Two-digit %y years can parse past 2014 (the parser's century pivot),
# which would make Age negative; subtract an extra century for those.
mutate(Age=ifelse(year(BirthDate)>2014,
2014-year(BirthDate)+100, 2014-year(BirthDate))) %>%
# Name and Age first; drop the raw name/birth-date components.
select(Name, Age,everything(), -FirstName, -LastName, -BirthDate)
Make row id a new column “id” as the node id. Name is the node label. Rename some columns for easier understanding.
# Node table for the employee network: one row per employee with a
# numeric id, a display label and a few descriptive attributes.
# select() renames in place, replacing the separate rename() step.
employee_nodes <- employee_records %>%
  select(label = Name, Gender, Age,
         Department = CurrentEmploymentType,
         Title = CurrentEmploymentTitle,
         Country = CitizenshipCountry) %>%
  arrange(label) %>%
  rowid_to_column("id")
Load in the edges list from the email headers. Separate SentDate and SentTime. Because all emails were sent in January 2014, an additional column is created for the day of the month. This will be used to visualize the number of emails sent each day by each sender, considering that an email may be sent to multiple receivers. Another column, “Type”, is created to indicate whether the email is a direct email or a reply to another email.
employee_email <- read.csv("MC1/email headers.csv", encoding="UTF-8")
# One row per (sender, single recipient): expand multi-recipient emails,
# strip the @domain parts, split the timestamp, and classify each message
# as a direct email or a reply.
employee_email_agg <- employee_email %>%
  # Force subjects into ASCII so string matching below is stable.
  transform(Subject = stringi::stri_enc_toascii(Subject)) %>%
  separate_rows(To, sep = ",") %>%
  separate(From, c("From", "FromEmail"), sep = "@") %>%
  separate(To, c("To", "ToEmail"), sep = "@") %>%
  mutate(To = str_trim(To)) %>%
  transform(Date = parse_date_time(Date, c("mdy_hm", "mdy"))) %>%
  transform(SentDate = date(Date),
            SentTime = format(Date, format = "%H:%M")) %>%
  # str_detect() already returns a logical; the original's "== TRUE"
  # comparison was redundant and has been dropped.
  mutate(Type = ifelse(str_detect(Subject, "RE:"), "reply", "direct")) %>%
  select(From, To, SentDate, SentTime, Type, Subject) %>%
  rename(Source = From, Target = To) %>%
  # Replace only the first space (between first and last name) so the
  # names match the dotted Name format in employee_records.
  transform(Source = sub(" ", ".", Source),
            Target = sub(" ", ".", Target))
Manipulate the table to detect any employee who sends email to all employees, i.e. sends to 53 recipients excluding himself or herself. Group the records by source, day, type, and subject. The table below indicates that only 2 people send to the whole company: Mat Bramar and Ruscella Mies Haber. They both work in the administration department. Mat is assistant to the CEO. From the subjects, we can conclude that he regularly sends announcements, reminders, and matters that relate to the whole company, such as “All staff announcement” and “Changes to travel policy”.
Ruscella is assistant to engineering group manager. She sends only 2 types of emails to the whole company, “Daily morning announcements” and “Good morning, GasTech!” Both are daily news updates for the whole company to read.
# Company-wide blasts: messages a sender delivered to all 53 other
# employees on one day under one subject.
email_eachday <- employee_email_agg %>%
  filter(Source != Target) %>%
  group_by(Source, SentDate, Type, Subject) %>%
  summarise(n = n(), .groups = "drop") %>%
  filter(n == 53) %>%
  # Keep Type explicitly (first, matching the original column order):
  # the original select() dropped it, and only the grouped frame silently
  # re-added it — yet the anti-join against these blasts later needs it.
  select(Type, Source, SentDate, Subject) %>%
  arrange(Source, Subject)
# Filterable table of the company-wide email blasts found above.
DT::datatable(
  email_eachday,
  filter = "top",
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    htmltools::em("Employee who sent emails to the whole company")
  ),
  rownames = FALSE,
  options = list(pageLength = 7, autoWidth = FALSE)
)
The interactive diagram below shows the exchange of email from all employees.
# Count emails per (sender, recipient) pair, convert to a weighted
# adjacency matrix, and render it as an interactive chord diagram.
email_matrix <- employee_email_agg %>%
  select(Source, Target) %>%
  group_by(Source, Target) %>%
  summarise(n = n(), .groups = "drop")
# graph.data.frame()/get.adjacency() are deprecated igraph names; use
# the current graph_from_data_frame()/as_adjacency_matrix() equivalents.
email_matrix <- graph_from_data_frame(email_matrix, directed = TRUE)
email_matrix <- as_adjacency_matrix(email_matrix, attr = "n", sparse = TRUE)
chorddiag(as.matrix(email_matrix),
          width = 500, height = 500,
          groupnamePadding = 20, groupPadding = 3, groupnameFontsize = 10,
          showTicks = FALSE)
There is a large number of emails sent from and received by Mat Bramar and Ruscella Mies Haber. This is because they are responsible for administrative tasks, such as sending announcements and reminders. From the previous interactive table, we have seen that they send many administrative emails to the whole company.
Next, we will remove such exchanges that were sent to all employees for better interpretation of the emails. This includes both directed email and reply emails to all.
# Drop every company-wide blast identified above. anti_join() keeps only
# the rows of employee_email_agg with no match in email_eachday — the
# same result as the original mark-with-"yes" right_join + is.na filter,
# in one direct step.
employee_email_agg2 <- employee_email_agg %>%
  anti_join(email_eachday,
            by = c("Source", "Subject", "Type", "SentDate")) %>%
  filter(Source != Target) %>%
  select(Source, Target, SentDate, SentTime, Subject, Type)
# Rebuild the adjacency matrix from the filtered email list and redraw
# the chord diagram without the company-wide blasts.
email_matrix2 <- employee_email_agg2 %>%
  select(Source, Target) %>%
  group_by(Source, Target) %>%
  summarise(n = n(), .groups = "drop")
# Modern igraph names replace the deprecated graph.data.frame() and
# get.adjacency() used originally.
email_matrix2 <- as.matrix(as_adjacency_matrix(
  graph_from_data_frame(email_matrix2, directed = TRUE),
  attr = "n", sparse = TRUE))
chorddiag(email_matrix2,
          width = 500, height = 500,
          groupnamePadding = 20, groupPadding = 3, groupnameFontsize = 10,
          showTicks = FALSE)
Prepare nodes and edges table for visNetwork.
# Add an HTML tooltip to each node and expose Department as the
# visNetwork grouping column (renamed first so the tooltip can use it).
employee_nodes <- employee_nodes %>%
  rename(group = Department) %>%
  mutate(title = paste("<p>", label, Gender, Age, "</br>",
                       "<br>", group, Title, "</br></p>", sep = " "))
# Edge table for visNetwork: map names to node ids, count emails per
# (from, to, Type), then collapse the per-type counts into one edge with
# a total weight and a combined tooltip (e.g. "3 direct, 2 reply").
employee_edges <- employee_email_agg2 %>%
left_join(employee_nodes%>%select(id,label),
by=c("Source"="label"), suffix=c(".Source", ".Target")) %>%
left_join(employee_nodes%>%select(id,label),
by=c("Target"="label"), suffix=c(".Source", ".Target")) %>%
rename(from=id.Source, to=id.Target) %>%
group_by(from, to, Source, Target, Type) %>%
summarise(Sum=n()) %>%
ungroup() %>%
group_by(from, to, Source, Target) %>%
# value = total email count on this edge across both types
mutate(value=sum(Sum)) %>%
mutate(Tooltip=paste(Sum, Type)) %>%
select(-Sum, -Type) %>%
# paste(collapse=...) inside a grouped mutate joins the per-type strings
# within each edge group, duplicating the result onto every row ...
mutate(Tooltip=paste(Tooltip, collapse=", ")) %>%
mutate(title=paste("<p>from",Source,"to",Target, "</br>",
Tooltip, "</br>")) %>%
ungroup() %>%
# ... and distinct() then keeps a single row per edge.
distinct(from, to, value, title)
# Interactive email network; nodes grouped/coloured by department, edges
# directed from sender to recipient with force-directed layout.
visNetwork(employee_nodes, employee_edges) %>%
  # Stray trailing comma removed from the original visEdges() call.
  visEdges(arrows = 'to') %>%
  visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%
  visLegend() %>%
  visIgraphLayout(layout = "layout_with_fr")
Visualize together with interactive table of all emails.